#delimit;
*****************************;
*	Name:	youngchild_ZAF.do ;
*	Author: Taryn Dinkelman;
*	Date: 22 July 2006 ;
*		--last update: 29 July 2006, t.d., 21 Aug by ZM;
*	Description: cleans South Africa guardian file (ages 15+) data and ;
* 			 prepares variables for propensity score logits ;
*
*	Input file : paref32004.dta;
* 	Output files: youngchild_ZAF;
******************************* ;
* note: the first part of this file cleans up the variables for kids 2-11;
* the second part cleans up some of the relevant vars of the responding;
* guardian of the child. However, we still need to merge in some of the X's;
* from the parents/heads of hh, IF These parents or head of hh were interviewed in ;
* the detailed adult module;

* Load the raw guardian-module file and stamp the log with the run date and time;
use "$stata/paref32004.dta", clear;
di "$S_DATE @ $S_TIME";

* Overview of the sample size and of every variable before any cleaning;
count;
summarize;

************** PART 1 - CHILD INFO ******************;



*******AGE, GENDER, RACE, PROVINCE, RURAL/URBAN, LANG and NATIONALITY OF KIDS *****;
* Builds child age, age squared, and a 5-year age-group code from q2_1;
* q2_1 ranges over 2-11, with negative codes standing for missing answers;
tabulate q2_1, missing;

* Sanity check: q2_1 should agree with the pre-built age-group variables;
foreach grpvar of varlist agec agefive agefveb { ;
	tabulate q2_1 `grpvar', missing;
} ;

* Negative codes become missing. agesq is zero-filled right here, whereas age itself;
* is zero-filled later on, when its missing-value dummy is created;
generate age = q2_1 if q2_1 >= 0;
generate agesq = cond(age == ., 0, age*age);
label variable age "age at last birthday";
label variable agesq "age (at last birthday) squared";
tabulate age, missing;
tabulate agesq, missing;

* Age groups: 1 for ages 0-4, 2 for ages 5-9, 3 for ages 10-14, missing otherwise;
generate agedisc = cond(inrange(age, 0, 4), 1,
		cond(inrange(age, 5, 9), 2,
		cond(inrange(age, 10, 14), 3, .)));
label variable agedisc "variable age discretized into 5-yr age groups";
label define agedisc
	1 "0-5" 2 "5-10" 3 "10-15" 4 "15-20" 5 "20-25" 6 "25-30" 7 "30-35"
	8 "35-40" 9 "40-45" 10 "45-50" 11 "50-55" 12 "55-60" 13 "60-65"
	14 "65-70" 15 "70-75" 16 "75-80" 17 "80-85" 18 "85-90" 19 "90-95" 20 "95-99";
label values agedisc agedisc;

* Female indicator for the child, built from q2_2: 2 maps to female=1, 1 maps to female=0;
* every other code (including negative missing codes) is left missing;
tab q2_2, mi;
gen female=.;
replace female=1 if q2_2==2;
replace female=0 if q2_2==1;
lab var female "female indicator";
lab def female 0 "male" 1 "female";
* Attach the value label to the child variable as well, so that female is displayed;
* the same way as female_g, which is given this same value label further down;
lab val female female;
tab female, mi;

* Race dummies from the positive codes of q2_3;
* (no observations are coded as "other", and some are missing);
tabulate q2_3, missing;
tabulate q2_3 if q2_3 > 0, generate(race);

* Rename race1-race4 to readable names, zero-fill missings, and label each dummy;
local racenames "african white coloured indian";
forvalues k = 1/4 { ;
	local grp : word `k' of `racenames';
	rename race`k' `grp';
	replace `grp' = 0 if `grp' == .;
	label variable `grp' "dummy for `grp'";
	label define `grp' 1 "`grp'" 0 "not `grp'";
} ;

* Placeholder copy of the valid race codes, used later to build the race_d dummy;
generate race = q2_3 if q2_3 > 0;

*province and rural/urban;
*creates province & rural/urban dummies;
* Tabulate both province codings, drop provc, then create one dummy per value of province;
tab province, mi;
tab provc, mi;
drop provc;
tab province, gen(province);
* The renaming loop below is disabled, so the dummies keep the generated names;
* province1, province2, and so on, and their missing values are not zero-filled here;
*local i=1;
*foreach province in west_cape east_cape north_cape free_state kwazulu_natal north_west gauteng mpumalanga limpopo { ;
*	ren province`i' `province';
*	replace `province'=0 if `province'==.;
*	lab var `province' "dummy for `province' province";
*	local i=`i'+1;
*	} ;

* Two rural/urban codings: a binary flag (urban) and one dummy per geotype category;
* urban is 1 for geotype 1 or 2, 0 for geotype 3 or 4, and missing otherwise;
generate urban = cond(inlist(geotype, 1, 2), 1, cond(inlist(geotype, 3, 4), 0, .));
label variable urban "dummy==1 if urban resident";

* geo1-geo4 are renamed in category order, zero-filled, and labelled;
tabulate geotype, generate(geo);
local geonames "urban_formal urban_informal tribal rural";
forvalues k = 1/4 { ;
	local gname : word `k' of `geonames';
	rename geo`k' `gname';
	replace `gname' = 0 if `gname' == .;
	label variable `gname' "dummy for `gname'";
} ;

* citizenship and language;
* Citizenship indicator from q2_4: 1 maps to citizen=1, 2 maps to citizen=0,;
* and any other code is left missing;
generate citizen = (q2_4 == 1) if inlist(q2_4, 1, 2);
label variable citizen "dummy for South African citizen ('nationality' in survey)";
label define citizen 0 "not South African citizen" 1 "South African citizen";
tabulate citizen, missing;

* Home-language dummies: one dummy (language1, language2, ...) per positive code of q2_5;
tab q2_5, mi;
tab q2_5 if q2_5>0, gen(language);
* The loop zero-fills and labels the dummies in category order without renaming them;
* NOTE(review): it assumes exactly 14 language dummies were generated, in the order listed below;
* confirm against the q2_5 codebook;
local i=1;
foreach language in afrikaans english isindebele isiswati isixhosa isizulu sesotho sepedi setswana tshivenda xitsonga oth_european indian other { ;
	replace language`i'=0 if language`i'==.;
	lab var language`i' "dummy for `language' speaker (spoken most often at home)";
	local i=`i'+1;
	} ;
* Collapse dummies 12-14 into a single indicator (row sum) and drop the originals;
* NOTE(review): the label below mentions other_african, which is not in the name list above;
egen oth_lang=rsum(lang*12 lang*13 lang*14);	/*groups languages for which there are few obs*/
lab var oth_lang "dummy for other_african, oth_european, indian, or other speaker (spoken most often at home)";
drop lang*12 lang*13 lang*14;
gen lang=q2_5 if q2_5>0; 	/*to be used in creation of lang_d dummy later*/


*********** TESTED FOR HIV & TEST RESULT **************;
* tested: 1 if finresfh is 1 (tested), 0 if finresfh is 2 (refused), 3 (absent) or 4 (missing);
* in this file finresfh only takes the values 1 (tested) and 2 (refused);
* result: 1 if hivstat is 1 (positive), 0 if hivstat is 2 (negative), the only two values it takes;
tabulate finresfh, missing;
generate tested = cond(finresfh == 1, 1, cond(inlist(finresfh, 2, 3, 4), 0, .));

tabulate hivstat finresfh, missing;
generate result = cond(hivstat == 1, 1, cond(hivstat == 2, 0, .));

label variable tested "took HIV test";
label variable result "HIV test result";
label define tested 0 "did not take HIV test (refused, absent or missing)" 1 "took HIV test";
label define result 0 "negative" 1 "positive";

* Confirms that no result is recorded for respondents who did not take the test;
tabulate tested result, missing;

******************* EDUCATION ******************* ;
* The child module has no direct "are you enrolled" item, so enrolment is inferred;
* from the school attendance question (q4_2). This should be double checked against;
* the enrolment question in the hh questionnaire once indiv and hh records can be merged;

tabulate q4_2, missing;
generate enrolled = cond(q4_2 == 1, 1, cond(q4_2 == 2, 0, .));
label variable enrolled "dummy for currently enrolled in school";

* Attendance is kept as a separate variable, at this point a copy of enrolled;
generate attend = enrolled;
label variable attend "child attends school";

* There is no reading question from which literacy could be constructed;
* Years of education are imputed from the q2_6 categories using Taryn's approximations:;
* 1 (no schooling) gives 0 years, 2 (up to std1 or gr3 or abet1) gives 3,;
* 3 (std2-std3 or gr5 or abet2) gives 5, and 4 (std4-std5 or gr6-gr7 or abet3) gives 7;
tabulate q2_6, missing;
generate educyrs = cond(q2_6 == 1, 0,
		cond(q2_6 == 2, 3,
		cond(q2_6 == 3, 5,
		cond(q2_6 == 4, 7, .))));
label variable educyrs "education in years, imputed from q2_6";
tabulate educyrs q2_6, missing;
generate education = q2_6;

* One dummy per imputed education level, with missings zero-filled;
tabulate educyrs, generate(educ);
foreach dum of varlist educ1 educ2 educ3 educ4 { ;
	replace `dum' = 0 if `dum' == .;
} ;
label variable educ1 "dummy for no_school";
label variable educ2 "dummy for primary school<=std1";
label variable educ3 "dummy for primary school<=std3";
label variable educ4 "dummy for primary school<=std5";


********ORPHAN STATUS*************;

*note mother and father dummies already in dataset, but treat "don't knows" as missing;
* Mother status: one dummy per positive code of q2_7;
tab q2_7, mi;
tab q2_7 if q2_7>0, gen(mother);
* moth is a placeholder used later to build the moth_d missing-data dummy. It must keep only;
* the positive codes of q2_7: if the negative missing codes were copied over, moth would never;
* be missing for them and moth_d (and hence orphan_d) would fail to flag those observations;
gen moth=q2_7 if q2_7>0;
lab var mother1 "dummy=1 if mother alive (q2_7)";
lab var mother2 "dummy=1 if mother dead (q2_7)";
lab var mother3 "dummy=1 if doesn't know if mother alive (q2_7)";

* Father status: same construction from q2_9, with fath as the placeholder for fath_d;
tab q2_9, mi;
tab q2_9 if q2_9>0, gen(father);
gen fath=q2_9 if q2_9>0;
lab var father1 "dummy=1 if father alive (q2_9)";
lab var father2 "dummy=1 if father dead (q2_9)";
lab var father3 "dummy=1 if doesn't know if father alive (q2_9)";

* Zero-fill the status dummies for observations that were excluded from the tabulations;
forvalues i = 1/3 {;
	replace mother`i'=0 if mother`i'==.;
	replace father`i'=0 if father`i'==.;
};

* orphan is 1 only when both parents are reported dead, and 0 in every other case;
gen orphan=0;
replace orphan=1 if mother2==1 & father2==1;
lab var orphan "dummy=1 if both mother & father dead";

tab age q2_8 ;
tab age q2_10;



*******SEXUAL BEHAVIOR, INJECTIONS & ALCOHOL USE*****************;
* no relevant questions for kids 2-11;

**********HIV/AIDS KNOWLEDGE & PERCEPTIONS*********;
* a few relevant items exist, all of them reported by the guardian;

* q4_13: has the guardian discussed sex with this child? Only positive codes are kept,;
* so that a missing-value dummy can be created for sex_discuss later;
tabulate q4_13, missing;
generate sex_discuss = q4_13 if q4_13 > 0;
tabulate sex_discuss, generate(sex_discuss);
label variable sex_discuss "from q4_13, have you discussed sex with this child?";
label variable sex_discuss1 "guardian of 2-11 yr old has discussed sex with child";
label variable sex_discuss2 "guardian of 2-11 yr old has not discussed sex with child";
label variable sex_discuss3 "no response: have you discussed sex with this child?";
forvalues k = 1/3 { ;
	replace sex_discuss`k' = 0 if sex_discuss`k' == .;
} ;

* q4_16: has the child heard of HIV? Only asked when the sexual abuse question;
* was not answered 'yes'. Codes outside 1-9998 are left missing and code 2 becomes 0;
tabulate q4_16, missing;
generate hivhear = cond(q4_16 == 2, 0, q4_16) if q4_16 > 0 & q4_16 < 9999;
label variable hivhear "1=guardian knows kid has heard of HIV/AIDS before, 0=has not";


************** PART 2 - GUARDIAN INFO ******************;

************** SEX, AGE, RELATIONSHIP, DEPENDENTS ****** ;
* note: guardian race not asked in this module;

tab q1_1 q1_2, mi;

* biol_parent: 1 if q1_1==1, 0 if q1_1==2, missing for any other code;
gen biol_parent=1 if q1_1==1;
replace biol_parent=0 if q1_1==2;

* nonbiol_g: 1 if q1_2==1 and the respondent is not flagged as a biological parent,;
* 0 if q1_2==2, missing otherwise;
gen nonbiol_g=1 if q1_2==1 & biol_parent!=1;
replace nonbiol_g=0 if q1_2==2;

tab q1_3, mi;

* guardian: single relationship code (1-9) combining q1_1, q1_2 and q1_3;
gen guardian=.;
lab def guardian 
1 "biol_parent"
2 "adoptive parent"
3 "step-parent"
4 "grandparent"
5 "sibling"
6 "other family"
7 "unrelated guardian"
8 "other unrelated"
9 "not the guardian";
lab val guardian guardian;

* The replaces below run in priority order and each one only fills observations that are;
* still missing: biological parents get code 1, then valid q1_3 answers (1-7) are shifted;
* up by one to codes 2-8, then self-declared guardians with no relationship get code 7;
replace guardian=1 if biol_parent==1 & guardian==.;
replace guardian=q1_3+1 if q1_3>=1&q1_3<=7 & guardian==.;
replace guardian=7 if guardian==. & q1_2==1;
* note: there are 46 obs who report they are the guardian and then;
* don't report a relationship - set these as 'unrelated guardians';
replace guardian=9 if guardian==. & q1_2==2;
* some guardian obs are missing;
tab guardian, mi;
* One dummy per observed guardian code, with missings zero-filled;
* NOTE(review): the loop assumes all nine codes occur in the data, so that guardian1-guardian9 exist;
tab guardian if guardian!=., gen(guardian);
local i = 1 ;
while `i'<=9 {;
	replace guardian`i'=0 if guardian`i'==.;
	local i = `i'+1;
};

* Roster person number of the parent/guardian (positive codes of q1_6 only);
generate guardian_number = q1_6 if q1_6 > 0;
label variable guardian_number "personnum of guardian/parent in roster";

* guardian_survey comes from q1_7 for codes above 0 and up to 2, with code 2 recoded to 0;
generate guardian_survey = cond(q1_7 == 2, 0, q1_7) if q1_7 > 0 & q1_7 <= 2;
label variable guardian_survey "1=guardian was also selected for adult module";

* Sex of the guardian answering the questions, from q1_9:;
* 2 maps to female_g=1, 1 maps to female_g=0, any other code is left missing;
tabulate q1_9, missing;
generate female_g = (q1_9 == 2) if inlist(q1_9, 1, 2);
label variable female_g "female indicator (q1_9)";
label values female_g female;

* age of guardian;
tab q1_8, mi;
* there are some misreports here - probably reporting the child's age instead of the adult's age;
* set age of guardian to missing if guardian age is equal to or less than the child's age,;
* and set all guardian ages<=11 to missing (missing values are zero-filled later, when the;
* missing-value dummies are created);
gen age_g=q1_8 if q1_8>0;
* The age<. condition is essential: a missing child age compares as larger than any number,;
* so without it every guardian of a child with missing age would lose a valid reported age;
replace age_g=. if age>=age_g & age<.;
replace age_g=. if age_g<=11;
lab var age_g "age of guardian of child 2-11 at guardian's last birthday";

gen agesq_g=age_g*age_g;
* Label the guardian variable agesq_g here, not agesq, which holds the child's age squared;
lab var agesq_g "age of guardian at last birthday, squared";

* no clear reason to discretize the age of the guardian;

* Guardian's own biological children and other dependent children;
tab1 q1_10-q1_13, missing;
generate kids = q1_10 if q1_10 >= 0;
label variable kids "total biol kids of guardian (q1_10)";
generate dept_kids = q1_11 if q1_11 >= 0 & q1_11 < 9999;
label variable dept_kids "total biol kids dept on guardian (q1_11)";
* Consistency check: dependent biological kids should not exceed total biological kids;
* (a failure is reported in the log but does not stop the do-file);
capture noisily assert dept_kids <= kids if kids < . & dept_kids < .;

tabulate q1_13 q1_12, missing;
* there are some inconsistencies between q1_12 and q1_13;
* dept_nonkids is 0 when q1_12==2, equals q1_13 when q1_12==1 and q1_13 is not negative,;
* and is missing in every other case;
generate dept_nonkids = cond(q1_12 == 2, 0, q1_13) if q1_12 == 2 | (q1_12 == 1 & q1_13 >= 0);
notes dept_nonkids: some guardians with nonkid depts report nonkids dependent number ;
label variable dept_nonkids "total non-kids dependents";

generate dept_total = dept_nonkids + dept_kids;
label variable dept_total "total dependents of this guardian, created";
tabulate dept_total, missing;
* note: dept_total is hard to interpret, because every respondent should have;
* at least 1 dependent (the child aged 2-11), and the value is missing for some guardians;


**********HIV/AIDS KNOWLEDGE & PERCEPTIONS*********;
* For the HIV prevalence paper, we want to assign to the CHILD;
* the knowledge that the GUARDIAN reports. For the schooling paper,;
* it's not necessary to have info on what kids may know about HIV.;

* For HIV knowledge, we would like to use this module instead of the ;
* matched info from the adultyouth module, because there will only;
* be a subset of guardians responding to that module.;

* However, the way the questions were asked of the guardian vs. the;
* adults is different - guardians are asked to volunteer ANY transmission;
* method or protection method, while adults are asked y/n/dnk for EACH;
* option. The variables would not be comparable.;

* It is also not possible to generate a 'sufficient' knowledge variable;
* for the guardian - while the guardian is able to identify prevention ;
* methods, the person is never asked to reject a set of misconceptions, and ;
* so we can only tell whether they report a misconception or not.;

* Decision 1: rely on the matched data from the adult module for;
* sexual behavior vars and hiv knowledge and info vars from the guardian;
* although this will only be for a subset of guardians (75%).; 


*************IMPUTE 0 for MISSING VALS, and CREATE DUMMIES FOR MISSING VALUES *****************************;
sum urban geotype province tested;
* none of these vars need missing dums;

* create dummies;
* For each variable in newvars, the loop creates a flag named after it with suffix _d that is 1;
* where the variable is missing, then replaces the missing value by 0 and tabulates the pair;
global newvars "age agedisc female race
		citizen lang enrolled attend educyrs moth fath
		sex_discuss hivhear biol_parent nonbiol_g guardian
		guardian_survey female_g age_g agesq_g kids dept_kids
		dept_nonkids dept_total";

foreach var of global newvars {;
	gen `var'_d=0;
	lab var `var'_d "dummy==1 if `var' was imputed due to missing or not applicable data";
	replace `var'_d=1 if `var'==.;
	replace `var'=0 if `var'==.;
	tab `var' `var'_d;
};

* orphan_d flags observations where the mother's or the father's status was missing;
gen orphan_d=1 if moth_d==1|fath_d==1;
replace orphan_d=0 if orphan_d==.;
* The variable name is written out in the label: the loop macro no longer exists once the;
* foreach loop has finished, so referring to it here would leave a blank in the label;
lab var orphan_d "dummy==1 if orphan was imputed due to missing data";

* cross check the missing dummies against the missing codes of the raw variables;
* In each group ygrp lists the dummies and xgrp the raw survey items, matched by position;
* The first two groups assert that every negative or 9999 raw code is flagged by its dummy;
* (a failed assertion is reported in the log but does not stop the do-file);
* The last two groups are only tabulated, for inspection by eye;
local ygrp="age_d agedisc_d female_d race_d citizen_d lang_d enrolled_d";
local xgrp="q2_1 q2_1 q2_2 q2_3 q2_4 q2_5 q4_2";

local n : word count `xgrp';
forvalues i = 1/`n' { ;
	local x : word `i' of `xgrp';
	local y : word `i' of `ygrp';
*	tab `x'  `y', mi;
	di "comparing `x' and `y'";	
	cap noisily assert `y'==1 if `x'<0|`x'==9999;

} ;



local ygrp="attend_d educyrs_d moth_d fath_d sex_discuss_d hivhear_d";
local xgrp="q4_2 q2_6 q2_7 q2_9 q4_13 q4_16";

local n : word count `xgrp';
forvalues i = 1/`n' { ;
	local x : word `i' of `xgrp';
	local y : word `i' of `ygrp';
*	tab `x'  `y', mi;
	di "comparing `x' and `y'";	
	cap noisily assert `y'==1 if `x'<0|`x'==9999;
} ;

local ygrp="biol_parent_d nonbiol_g_d guardian_d guardian_survey_d";
local xgrp = "q1_1 q1_2 q1_3 q1_7";

local n : word count `xgrp';
forvalues i = 1/`n' { ;
	local x : word `i' of `xgrp';
	local y : word `i' of `ygrp';
	tab `x'  `y', mi;
} ;


local ygrp="female_g_d age_g_d agesq_g_d kids_d dept_kids_d dept_nonkids_d";
local xgrp="q1_9 q1_8 q1_8 q1_10 q1_11 q1_13";

* The lists are referenced by their plain names, exactly as in the three groups above;
local n : word count `xgrp';
forvalues i = 1/`n' { ;
	local x : word `i' of `xgrp';
	local y : word `i' of `ygrp';
	tab `x'  `y', mi;
} ;

* notes on the tabulations above:;
* discrepancy btwn q1_2 and nonbiol_g_d is a data/skip pattern issue (tab q1_2 q1_1);
* discrepancy btwn q1_7 and guardian_survey_d is a data discrepancy (tab q1_7);
* discrepancy btwn q1_8 and age_g is because implausible guardian ages were set to missing;
* and then zero-filled;
* discrepancy btwn q1_13 and dept_nonkids is a data/skip pattern issue (tab q1_13 q1_12);


*check distribution of number of missing vars per observation;
* num_mis is the row sum of every variable whose name ends in _d;
* NOTE(review): the wildcard also picks up orphan_d and any other variable ending in _d;
egen num_mis=rsum(*_d);
lab var num_mis "number of missing values for vars included in logit, by obs";
tab num_mis, mi;
*note that num_mis records the number of missing, not applicable, or no response;
* variables per observation among those variables created in this do-file;
*num_mis should be treated as a rough guide to the extent of data imputation in this dataset,;
* not a definitive tally of missing vars or obs;


*************CLEAN UP **********************;
* Beware when appending this file to other modules: a question name such as q1_1;
* refers to a different survey question in each module;

* Negative codes in the remaining raw variables are turned into missing values;
foreach rawvar of varlist vpno houseno pnresp qa qb agefive agefveb hivstat ireal1 ibreal1 sexact { ;
	replace `rawvar' = . if `rawvar' < 0;
} ;

* Raw items that have been renamed and cleaned above;
drop q2_1 q2_2 q2_3 q2_4 q2_5 q4_2 q2_7 q2_9 q4_13 q4_16 q1_1 q1_2 q1_3 q1_6 q1_7 q1_9 q1_8 q1_10 q1_11 q1_12 q1_13;

* Remaining questionnaire items that are not used right now (sections 1-4, 6 and 7);
foreach stub in 1 2 3 4 6 7 { ;
	drop q`stub'*;
} ;

* String and interview-administration variables that are not needed;
drop projno interno superno nosp sexact times timec finresq munc eac ;

* Placeholders that were only needed to build the missing-value dummies;
drop moth fath lang race;

* idk must duplicate id before it can be dropped;
assert id == idk;
drop idk;

quietly compress;
label data "ZAF data from paref32004.dta, cleaned $S_DATE @ $S_TIME by td";
save "$data/youngchild_ZAF.dta", replace;

capture log close;
